source: Lung Cancer Dataset

# Load necessary libraries
library(readr) # import data
library(ggplot2) # data visualization
library(plotly) # interactive data visualization
library(tidyr) # data manipulation
library(dplyr) # data manipulation

Data exploration and cleaning

# Import the dataset from the CSV file in the working directory
data <- read_csv(file = "Rdataset.csv")
## Rows: 309 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): GENDER, LUNG_CANCER
## dbl (14): AGE, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE, CHRONIC DISE...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the tibble: first rows plus the type of each column
print(data)
## # A tibble: 309 × 16
##    GENDER   AGE SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE `CHRONIC DISEASE`
##    <chr>  <dbl>   <dbl>          <dbl>   <dbl>         <dbl>             <dbl>
##  1 M         69       1              2       2             1                 1
##  2 M         74       2              1       1             1                 2
##  3 F         59       1              1       1             2                 1
##  4 M         63       2              2       2             1                 1
##  5 F         63       1              2       1             1                 1
##  6 F         75       1              2       1             1                 2
##  7 M         52       2              1       1             1                 1
##  8 F         51       2              2       2             2                 1
##  9 F         68       2              1       2             1                 1
## 10 M         53       2              2       2             2                 2
## # ℹ 299 more rows
## # ℹ 9 more variables: FATIGUE <dbl>, ALLERGY <dbl>, WHEEZING <dbl>,
## #   `ALCOHOL CONSUMING` <dbl>, COUGHING <dbl>, `SHORTNESS OF BREATH` <dbl>,
## #   `SWALLOWING DIFFICULTY` <dbl>, `CHEST PAIN` <dbl>, LUNG_CANCER <chr>
# Show the structure of the data: every column with its type and first values
data %>% str()
## spc_tbl_ [309 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ GENDER               : chr [1:309] "M" "M" "F" "M" ...
##  $ AGE                  : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
##  $ SMOKING              : num [1:309] 1 2 1 2 1 1 2 2 2 2 ...
##  $ YELLOW_FINGERS       : num [1:309] 2 1 1 2 2 2 1 2 1 2 ...
##  $ ANXIETY              : num [1:309] 2 1 1 2 1 1 1 2 2 2 ...
##  $ PEER_PRESSURE        : num [1:309] 1 1 2 1 1 1 1 2 1 2 ...
##  $ CHRONIC DISEASE      : num [1:309] 1 2 1 1 1 2 1 1 1 2 ...
##  $ FATIGUE              : num [1:309] 2 2 2 1 1 2 2 2 2 1 ...
##  $ ALLERGY              : num [1:309] 1 2 1 1 1 2 1 2 1 2 ...
##  $ WHEEZING             : num [1:309] 2 1 2 1 2 2 2 1 1 1 ...
##  $ ALCOHOL CONSUMING    : num [1:309] 2 1 1 2 1 1 2 1 1 2 ...
##  $ COUGHING             : num [1:309] 2 1 2 1 2 2 2 1 1 1 ...
##  $ SHORTNESS OF BREATH  : num [1:309] 2 2 2 1 2 2 2 2 1 1 ...
##  $ SWALLOWING DIFFICULTY: num [1:309] 2 2 1 2 1 1 1 2 1 2 ...
##  $ CHEST PAIN           : num [1:309] 2 2 2 2 1 1 2 1 1 2 ...
##  $ LUNG_CANCER          : chr [1:309] "YES" "YES" "NO" "NO" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   GENDER = col_character(),
##   ..   AGE = col_double(),
##   ..   SMOKING = col_double(),
##   ..   YELLOW_FINGERS = col_double(),
##   ..   ANXIETY = col_double(),
##   ..   PEER_PRESSURE = col_double(),
##   ..   `CHRONIC DISEASE` = col_double(),
##   ..   FATIGUE = col_double(),
##   ..   ALLERGY = col_double(),
##   ..   WHEEZING = col_double(),
##   ..   `ALCOHOL CONSUMING` = col_double(),
##   ..   COUGHING = col_double(),
##   ..   `SHORTNESS OF BREATH` = col_double(),
##   ..   `SWALLOWING DIFFICULTY` = col_double(),
##   ..   `CHEST PAIN` = col_double(),
##   ..   LUNG_CANCER = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Checking for missing values

# Count the NA entries in each column; a non-zero count means missing data
data %>%
  is.na() %>%
  colSums()
##                GENDER                   AGE               SMOKING 
##                     0                     0                     0 
##        YELLOW_FINGERS               ANXIETY         PEER_PRESSURE 
##                     0                     0                     0 
##       CHRONIC DISEASE               FATIGUE               ALLERGY 
##                     0                     0                     0 
##              WHEEZING     ALCOHOL CONSUMING              COUGHING 
##                     0                     0                     0 
##   SHORTNESS OF BREATH SWALLOWING DIFFICULTY            CHEST PAIN 
##                     0                     0                     0 
##           LUNG_CANCER 
##                     0

There are no missing values in the dataset.

Data manipulation

# Prepare the analysis table in a single pipeline:
#   1. drop the symptom columns, which are not relevant to the study;
#   2. rename the columns whose names contain spaces to snake-style names;
#   3. turn GENDER and LUNG_CANCER into factors and append numeric copies
#      (the factor level codes) so they can enter the correlation matrix.
data <- data %>%
  select(
    -c(
      YELLOW_FINGERS, FATIGUE, ALLERGY, WHEEZING, COUGHING,
      `SHORTNESS OF BREATH`, `SWALLOWING DIFFICULTY`, `CHEST PAIN`
    )
  ) %>%
  rename(
    ALCOHOL_CONSUMING = `ALCOHOL CONSUMING`,
    CHRONIC_DISEASE = `CHRONIC DISEASE`
  ) %>%
  mutate(
    GENDER = as.factor(GENDER),
    LUNG_CANCER = as.factor(LUNG_CANCER),
    GENDER_num = as.numeric(GENDER),
    LUNG_CANCER_num = as.numeric(LUNG_CANCER)
  )

# Confirm the resulting columns and their types
str(data)
## tibble [309 × 10] (S3: tbl_df/tbl/data.frame)
##  $ GENDER           : Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
##  $ AGE              : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
##  $ SMOKING          : num [1:309] 1 2 1 2 1 1 2 2 2 2 ...
##  $ ANXIETY          : num [1:309] 2 1 1 2 1 1 1 2 2 2 ...
##  $ PEER_PRESSURE    : num [1:309] 1 1 2 1 1 1 1 2 1 2 ...
##  $ CHRONIC_DISEASE  : num [1:309] 1 2 1 1 1 2 1 1 1 2 ...
##  $ ALCOHOL_CONSUMING: num [1:309] 2 1 1 2 1 1 2 1 1 2 ...
##  $ LUNG_CANCER      : Factor w/ 2 levels "NO","YES": 2 2 1 1 1 2 2 2 1 2 ...
##  $ GENDER_num       : num [1:309] 2 2 1 2 1 1 2 1 1 2 ...
##  $ LUNG_CANCER_num  : num [1:309] 2 2 1 1 1 2 2 2 1 2 ...

Correlation matrix

# Compute the pairwise correlation matrix of all numeric columns (factor
# columns are excluded; their *_num copies are used instead). The matrix is
# printed here and reused later for the heatmap and the per-variable listing.
# `select(where(is.numeric))` replaces the superseded scoped verb `select_if()`.
corr_data <- data %>%
  select(where(is.numeric)) %>%
  cor()
corr_data

Heatmap

# Interactive heatmap of the correlation matrix, with variable names on both axes
plot_ly(
  z = corr_data,
  x = colnames(corr_data),
  y = rownames(corr_data),
  type = "heatmap"
)
# Problem: the significantly correlated variables cannot be identified just by
# looking at the heatmap directly

# Print the correlation between LUNG_CANCER_num and every other variable in the
# correlation matrix, one message per variable, in the matrix's row order.
# The values are read by name from the LUNG_CANCER_num row instead of scanning
# every (i, j) cell, which avoids the unsafe `1:ncol()` sequences, the
# elementwise `&` inside `if`, and the swapped row/column indices.
target_var <- "LUNG_CANCER_num"
if (target_var %in% colnames(corr_data)) {
  # Every variable except the target itself (skips the trivial self-correlation)
  other_vars <- setdiff(rownames(corr_data), target_var)
  for (other_var in other_vars) {
    print(paste(
      "Correlation between", target_var, "and", other_var,
      "is", corr_data[target_var, other_var]
    ))
  }
}
## [1] "Correlation between LUNG_CANCER_num and AGE is 0.0894645760662337"
## [1] "Correlation between LUNG_CANCER_num and SMOKING is 0.0581788858520387"
## [1] "Correlation between LUNG_CANCER_num and ANXIETY is 0.144947132887312"
## [1] "Correlation between LUNG_CANCER_num and PEER_PRESSURE is 0.186387631715407"
## [1] "Correlation between LUNG_CANCER_num and CHRONIC_DISEASE is 0.110891094642414"
## [1] "Correlation between LUNG_CANCER_num and ALCOHOL_CONSUMING is 0.288532803091731"
## [1] "Correlation between LUNG_CANCER_num and GENDER_num is 0.0672541746783065"
# The 3 variables most strongly correlated with LUNG_CANCER_num, selected for visualization:
# Correlation between ALCOHOL_CONSUMING and LUNG_CANCER_num is 0.288532803091731
# Correlation between PEER_PRESSURE and LUNG_CANCER_num is 0.186387631715407
# Correlation between ANXIETY and LUNG_CANCER_num is 0.144947132887312

Data visualization

Bar plot : ALCOHOL CONSUMING and LUNG_CANCER

# Proportion-stacked bars of LUNG_CANCER for each ALCOHOL_CONSUMING value;
# every segment is labelled with its row count, centred inside the segment.
ggplot(data = data, mapping = aes(x = ALCOHOL_CONSUMING, fill = LUNG_CANCER)) +
  geom_bar(position = position_fill()) +
  geom_text(
    mapping = aes(label = after_stat(count), group = LUNG_CANCER),
    stat = "count",
    position = position_fill(vjust = 0.5)
  )

Bar plot : PEER_PRESSURE and LUNG_CANCER

# Proportion-stacked bars of LUNG_CANCER for each PEER_PRESSURE value;
# every segment is labelled with its row count, centred inside the segment.
ggplot(data = data, mapping = aes(x = PEER_PRESSURE, fill = LUNG_CANCER)) +
  geom_bar(position = position_fill()) +
  geom_text(
    mapping = aes(label = after_stat(count), group = LUNG_CANCER),
    stat = "count",
    position = position_fill(vjust = 0.5)
  )

Bar plot : ANXIETY and LUNG_CANCER

# Proportion-stacked bars of LUNG_CANCER for each ANXIETY value;
# every segment is labelled with its row count, centred inside the segment.
ggplot(data = data, mapping = aes(x = ANXIETY, fill = LUNG_CANCER)) +
  geom_bar(position = position_fill()) +
  geom_text(
    mapping = aes(label = after_stat(count), group = LUNG_CANCER),
    stat = "count",
    position = position_fill(vjust = 0.5)
  )

สรุปจาก data visualization พบว่าความสัมพันธ์ระหว่าง ALCOHOL CONSUMING, PEER PRESSURE และ ANXIETY กับการเป็นหรือไม่เป็นโรค LUNG CANCER อยู่ในระดับค่อนข้างต่ำ

Data analysis

# Logistic regression of lung cancer status on the demographic and lifestyle
# variables. The response must be a factor for a binomial glm, so the
# conversion is repeated here (a no-op when the column is already a factor).
data <- data %>%
  mutate(LUNG_CANCER = as.factor(LUNG_CANCER))

model <- glm(
  formula = LUNG_CANCER ~ GENDER + AGE + SMOKING + ANXIETY + PEER_PRESSURE +
    CHRONIC_DISEASE + ALCOHOL_CONSUMING,
  family = "binomial",
  data = data
)

# Coefficient estimates, standard errors and p-values of the fitted model
summary(model)
## 
## Call:
## glm(formula = LUNG_CANCER ~ GENDER + AGE + SMOKING + ANXIETY + 
##     PEER_PRESSURE + CHRONIC_DISEASE + ALCOHOL_CONSUMING, family = "binomial", 
##     data = data)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -9.96635    2.23833  -4.453 8.48e-06 ***
## GENDERM           -0.22015    0.46688  -0.472 0.637261    
## AGE                0.01971    0.02286   0.862 0.388598    
## SMOKING            0.37898    0.40907   0.926 0.354213    
## ANXIETY            1.30636    0.46243   2.825 0.004728 ** 
## PEER_PRESSURE      1.67526    0.46296   3.619 0.000296 ***
## CHRONIC_DISEASE    1.43470    0.47169   3.042 0.002353 ** 
## ALCOHOL_CONSUMING  2.91488    0.58416   4.990 6.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 234.30  on 308  degrees of freedom
## Residual deviance: 169.29  on 301  degrees of freedom
## AIC: 185.29
## 
## Number of Fisher Scoring iterations: 6

สรุปจาก statistical analysis (logistic regression) ปัจจัยที่มีความสัมพันธ์กับการเกิด Lung Cancer อย่างมีนัยสำคัญทางสถิติสูงมาก (p-value < 0.001) ประกอบด้วย Peer pressure และ Alcohol consuming ส่วนปัจจัยที่มีความสัมพันธ์อย่างมีนัยสำคัญทางสถิติ (p-value < 0.01) ประกอบด้วย Anxiety และ Chronic disease